
// ===============================================================================================================
// -*- C++ -*-
//
// Matrix4x4.cpp - OpenGL compatible utility class for a 4x4 matrix of floats.
//
// Copyright (c) 2011 Guilherme R. Lampert
// guilherme.ronaldo.lampert@gmail.com
//
// This code is licensed under the MIT license.
//
// This software is provided "as is" without express or implied
// warranties. You may freely copy and compile this source into
// applications you distribute provided that the copyright text
// above is included in the resulting source code.
//
// ===============================================================================================================

#include <Matrix4x4.hpp>
#include <memory.h> // For memcpy()

// =========================================================
// Matrix4x4 Class Implementation
// =========================================================

// Constructs a matrix. When 'identity' is true the matrix is loaded with
// the identity; otherwise this constructor does not write any element.
Matrix4x4::Matrix4x4(bool identity)
{
	if (!identity)
	{
		return; // Caller asked for no initialization work.
	}

	LoadIdentity();
}

// Copy constructor: duplicates 'mat' with a raw byte-wise copy of the
// whole object.
Matrix4x4::Matrix4x4(const Matrix4x4 & mat)
{
	const Matrix4x4 * source = &mat;
	memcpy(this, source, sizeof(*this));
}

// Copy assignment: replaces the contents of this matrix with a byte-wise
// copy of 'mat' and returns a reference to this matrix.
Matrix4x4& Matrix4x4::operator = (const Matrix4x4 & mat)
{
	// memcpy() has undefined behavior when source and destination overlap,
	// which is exactly what happens on self-assignment (a = a), so skip
	// the copy in that case; the object already holds the right value.
	if (this != &mat)
	{
		memcpy(this, &mat, sizeof(Matrix4x4));
	}
	return (*this);
}

// Constructs a matrix from 16 individual elements.
// The parameters are listed row by row (as the matrix is written on paper),
// while each parameter 'mN' is stored at index N of the element array, so
// every group of four consecutive indices forms one column.
Matrix4x4::Matrix4x4(float m0, float m4, float  m8, float m12,
                     float m1, float m5, float  m9, float m13,
                     float m2, float m6, float m10, float m14,
                     float m3, float m7, float m11, float m15)
{
	// First column.
	m[0]  = m0;
	m[1]  = m1;
	m[2]  = m2;
	m[3]  = m3;

	// Second column.
	m[4]  = m4;
	m[5]  = m5;
	m[6]  = m6;
	m[7]  = m7;

	// Third column.
	m[8]  = m8;
	m[9]  = m9;
	m[10] = m10;
	m[11] = m11;

	// Fourth column.
	m[12] = m12;
	m[13] = m13;
	m[14] = m14;
	m[15] = m15;
}

// Overwrites this matrix with the identity matrix: ones on the main
// diagonal (indices 0, 5, 10 and 15) and zeros everywhere else.
void Matrix4x4::LoadIdentity(void)
{
#if (ENABLE_SSE_INTRINSICS)

	// Clear all four columns, then patch the diagonal through the
	// scalar view of the same storage.
	for (int c = 0; c < 4; ++c)
	{
		col[c] = _mm_setzero_ps();
	}

	m[15] = 1.0f;
	m[10] = 1.0f;
	m[5]  = 1.0f;
	m[0]  = 1.0f;

#else

	for (int i = 0; i < 16; ++i)
	{
		// Diagonal elements sit at the multiples of five.
		m[i] = ((i % 5) == 0) ? 1.0f : 0.0f;
	}

#endif // ENABLE_SSE_INTRINSICS
}

// Overwrites every element of this matrix with zero.
void Matrix4x4::LoadZero(void)
{
#if (ENABLE_SSE_INTRINSICS)

	// One 4-wide zero store per column.
	for (int c = 0; c < 4; ++c)
	{
		col[c] = _mm_setzero_ps();
	}

#else

	for (int i = 0; i < 16; ++i)
	{
		m[i] = 0.0f;
	}

#endif // ENABLE_SSE_INTRINSICS
}

// Transposes this matrix in place (rows become columns and vice versa).
void Matrix4x4::Transpose(void)
{
#if (ENABLE_SSE_INTRINSICS)

	// Step 1: interleave pairs of columns.
	//   0x44 picks elements {0,1} of each operand, 0xEE picks elements {2,3}.
	const __m128 low01  = _mm_shuffle_ps(col[0], col[1], 0x44); // a0 a1 b0 b1
	const __m128 low23  = _mm_shuffle_ps(col[2], col[3], 0x44); // c0 c1 d0 d1
	const __m128 high01 = _mm_shuffle_ps(col[0], col[1], 0xEE); // a2 a3 b2 b3
	const __m128 high23 = _mm_shuffle_ps(col[2], col[3], 0xEE); // c2 c3 d2 d3

	// Step 2: gather the matching lanes.
	//   0x88 picks elements {0,2} of each operand, 0xDD picks elements {1,3}.
	col[0] = _mm_shuffle_ps(low01,  low23,  0x88); // a0 b0 c0 d0
	col[1] = _mm_shuffle_ps(low01,  low23,  0xDD); // a1 b1 c1 d1
	col[2] = _mm_shuffle_ps(high01, high23, 0x88); // a2 b2 c2 d2
	col[3] = _mm_shuffle_ps(high01, high23, 0xDD); // a3 b3 c3 d3

#else

	// Exchange every element below the diagonal with its mirror image
	// above the diagonal; the diagonal itself stays where it is.
	for (int column = 0; column < 3; ++column)
	{
		for (int row = column + 1; row < 4; ++row)
		{
			const int below = (column * 4) + row;
			const int above = (row * 4) + column;
			SwapFast(m[below], m[above]);
		}
	}

#endif // ENABLE_SSE_INTRINSICS
}

// Multiplies this matrix by the 4-component column vector 'in' and writes
// the 4-component result to 'out'. The full result is computed before
// anything is written, so 'in' and 'out' may refer to the same array.
void Matrix4x4::TransformPoint(const float in[4], float out[4]) const
{
#if (ENABLE_SSE_INTRINSICS)

	// Scale each column by the matching input component...
	const __m128 xTerm = _mm_mul_ps(col[0], _mm_set_ps1(in[0]));
	const __m128 yTerm = _mm_mul_ps(col[1], _mm_set_ps1(in[1]));
	const __m128 zTerm = _mm_mul_ps(col[2], _mm_set_ps1(in[2]));
	const __m128 wTerm = _mm_mul_ps(col[3], _mm_set_ps1(in[3]));

	// ...then sum the four scaled columns pairwise.
	const __m128 xySum = _mm_add_ps(xTerm, yTerm);
	const __m128 zwSum = _mm_add_ps(zTerm, wTerm);

	// Unaligned store: 'out' carries no alignment guarantee.
	_mm_storeu_ps(out, _mm_add_ps(xySum, zwSum));

#else

	float result[4];

	// Each output component is the dot product of one matrix row with 'in'.
	for (int row = 0; row < 4; ++row)
	{
		result[row] = (m[row] * in[0]) + (m[row + 4] * in[1]) + (m[row + 8] * in[2]) + (m[row + 12] * in[3]);
	}

	for (int i = 0; i < 4; ++i)
	{
		out[i] = result[i];
	}

#endif // ENABLE_SSE_INTRINSICS
}

// Transforms the 3D point 'in' by this matrix and writes the result to 'out'.
// The point is treated as (x, y, z, 1): the fourth column is added unscaled.
// Only x, y and z of the result are produced; the resulting w component is
// discarded and no division by w takes place.
// The result is fully computed before 'out' is written, so 'in' and 'out'
// may refer to the same vector.
void Matrix4x4::TransformPoint(const Vec3 & in, Vec3 & out) const
{
#if (ENABLE_SSE_INTRINSICS)

	__m128 tmp0 = _mm_mul_ps(col[0], _mm_set_ps1(in.x));
	__m128 tmp1 = _mm_mul_ps(col[1], _mm_set_ps1(in.y));
	__m128 tmp2 = _mm_mul_ps(col[2], _mm_set_ps1(in.z));
	__m128 tmp3 = col[3]; // in.w is assumed to be 1.0

	tmp0 = _mm_add_ps(tmp0, tmp1);
	tmp2 = _mm_add_ps(tmp2, tmp3);
	tmp3 = _mm_add_ps(tmp0, tmp2);

	// Extract the lanes through a plain float array. The '.m128_f32' member
	// is specific to the MSVC definition of __m128 and is not available on
	// other compilers, whereas _mm_storeu_ps() works everywhere.
	float result[4];
	_mm_storeu_ps(result, tmp3);

	out.x = result[0];
	out.y = result[1];
	out.z = result[2];

#else

	const float x = (m[0] * in.x) + (m[4] * in.y) + (m[ 8] * in.z) + m[12];
	const float y = (m[1] * in.x) + (m[5] * in.y) + (m[ 9] * in.z) + m[13];
	const float z = (m[2] * in.x) + (m[6] * in.y) + (m[10] * in.z) + m[14];
	// in.w is assumed to be 1.0

	out.x = x;
	out.y = y;
	out.z = z;

#endif // ENABLE_SSE_INTRINSICS
}

// Overwrites this matrix with a pure translation by 'trans'.
// Any previous contents are discarded: the matrix is first reset to the
// identity and the offset is then stored in the fourth column.
void Matrix4x4::Translate(const Vec3 &trans)
{
	LoadIdentity();

	// Fourth column (indices 12..14) holds the x/y/z offset.
	m[14] = trans.z;
	m[13] = trans.y;
	m[12] = trans.x;
}

// Overwrites this matrix with a translation of 'dist' along the X axis.
// Previous contents are discarded (the matrix is reset to identity first).
void Matrix4x4::TranslateX(float dist)
{
	const int xOffsetIndex = 12; // x slot of the fourth column

	LoadIdentity();
	m[xOffsetIndex] = dist;
}

// Overwrites this matrix with a translation of 'dist' along the Y axis.
// Previous contents are discarded (the matrix is reset to identity first).
void Matrix4x4::TranslateY(float dist)
{
	const int yOffsetIndex = 13; // y slot of the fourth column

	LoadIdentity();
	m[yOffsetIndex] = dist;
}

// Overwrites this matrix with a translation of 'dist' along the Z axis.
// Previous contents are discarded (the matrix is reset to identity first).
void Matrix4x4::TranslateZ(float dist)
{
	const int zOffsetIndex = 14; // z slot of the fourth column

	LoadIdentity();
	m[zOffsetIndex] = dist;
}

// Overwrites this matrix with a rotation of 'angle' around 'axis'.
// Previous contents are discarded. A normalized copy of 'axis' is used, so
// the caller's vector is not modified.
// 'angle' is forwarded to Math::SineCosine(); its unit is whatever that
// helper expects.
//
// With u = normalized axis, c = cos(angle), s = sin(angle), t = 1 - c, the
// upper-left 3x3 block is the standard axis-angle rotation matrix:
//
//   | c + t*ux*ux      t*ux*uy - s*uz   t*ux*uz + s*uy |
//   | t*uy*ux + s*uz   c + t*uy*uy      t*uy*uz - s*ux |
//   | t*uz*ux - s*uy   t*uz*uy + s*ux   c + t*uz*uz    |
//
// stored column by column in m[0..10].
void Matrix4x4::Rotate(float angle, const Vec3 & axis)
{
	float s, c;
	Math::SineCosine(angle, s, c);

	Vec3 v(axis);
	v.Normalize();

	const float ux = v.x;
	const float uy = v.y;
	const float uz = v.z;
	const float t  = 1 - c;

	// First column.
	m[0]  = c + t * (ux * ux); // diagonal term needs ux squared, not ux
	m[1]  = t * ux*uy + s*uz;
	m[2]  = t * ux*uz - s*uy;
	m[3]  = 0;

	// Second column.
	m[4]  = t * uy*ux - s*uz;
	m[5]  = c + t * (uy * uy);
	m[6]  = t * uy*uz + s*ux;
	m[7]  = 0;

	// Third column.
	m[8]  = t * uz*ux + s*uy;
	m[9]  = t * uz*uy - s*ux; // off-diagonal term mixes uz and uy
	m[10] = c + t * (uz * uz);
	m[11] = 0;

	// Fourth column: no translation.
	m[12] = 0;
	m[13] = 0;
	m[14] = 0;
	m[15] = 1;
}

// Overwrites this matrix with a rotation of 'angle' around the X axis.
// Previous contents are discarded. 'angle' is forwarded to
// Math::SineCosine(); its unit is whatever that helper expects.
void Matrix4x4::RotateX(float angle)
{
	float sine, cosine;
	Math::SineCosine(angle, sine, cosine);

	LoadIdentity();

	// Only the Y/Z 2x2 sub-block differs from the identity.
	m[5]  =  cosine;
	m[10] =  cosine;
	m[6]  =  sine;
	m[9]  = -sine;
}

// Overwrites this matrix with a rotation of 'angle' around the Y axis.
// Previous contents are discarded. 'angle' is forwarded to
// Math::SineCosine(); its unit is whatever that helper expects.
void Matrix4x4::RotateY(float angle)
{
	float sine, cosine;
	Math::SineCosine(angle, sine, cosine);

	LoadIdentity();

	// Only the X/Z 2x2 sub-block differs from the identity.
	m[0]  =  cosine;
	m[10] =  cosine;
	m[8]  =  sine;
	m[2]  = -sine;
}

// Overwrites this matrix with a rotation of 'angle' around the Z axis.
// Previous contents are discarded. 'angle' is forwarded to
// Math::SineCosine(); its unit is whatever that helper expects.
void Matrix4x4::RotateZ(float angle)
{
	float sine, cosine;
	Math::SineCosine(angle, sine, cosine);

	LoadIdentity();

	// Only the X/Y 2x2 sub-block differs from the identity.
	m[0] =  cosine;
	m[5] =  cosine;
	m[1] =  sine;
	m[4] = -sine;
}

// Overwrites this matrix with a scaling matrix. Previous contents are
// discarded: the matrix is reset to the identity and the x/y/z factors of
// 'Scale' are then placed on the first three diagonal elements.
void Matrix4x4::Scale(const Vec3 & Scale)
{
	LoadIdentity();

	// Diagonal slots: 0 (x), 5 (y), 10 (z). Slot 15 keeps the identity's 1.
	m[10] = Scale.z;
	m[5]  = Scale.y;
	m[0]  = Scale.x;
}

// Returns the element-wise sum of this matrix and 'other'.
// Neither operand is modified.
Matrix4x4 Matrix4x4::operator + (const Matrix4x4 & other) const
{
	Matrix4x4 sum;

	for (int i = 0; i < 16; ++i)
	{
		sum.m[i] = m[i] + other.m[i];
	}

	return sum;
}

// Returns the element-wise difference (this - other).
// Neither operand is modified.
Matrix4x4 Matrix4x4::operator - (const Matrix4x4 & other) const
{
	Matrix4x4 difference;

	for (int i = 0; i < 16; ++i)
	{
		difference.m[i] = m[i] - other.m[i];
	}

	return difference;
}

// Returns the matrix product (this * other). Neither operand is modified.
// Elements are stored column by column, so column j of the result is this
// matrix applied to column j of 'other', i.e. a linear combination of the
// columns of this matrix weighted by the four elements of that column.
Matrix4x4 Matrix4x4::operator * (const Matrix4x4 & other) const
{
	Matrix4x4 result;

#if (ENABLE_SSE_INTRINSICS)

	// Note: no 'register' storage class on the loop counter; the specifier
	// was removed from the language in C++17 and compilers reject it there.
	for (int j = 0; j < 4; ++j)
	{
		const float * rhs = &other.m[j * 4]; // column j of 'other'

		__m128 acc = _mm_mul_ps(col[0], _mm_set_ps1(rhs[0]));
		acc = _mm_add_ps(_mm_mul_ps(col[1], _mm_set_ps1(rhs[1])), acc);
		acc = _mm_add_ps(_mm_mul_ps(col[2], _mm_set_ps1(rhs[2])), acc);
		acc = _mm_add_ps(_mm_mul_ps(col[3], _mm_set_ps1(rhs[3])), acc);

		// Assign the whole column through the __m128 view instead of doing
		// an aligned store through a float pointer.
		result.col[j] = acc;
	}

#else

	for (int j = 0; j < 4; ++j)
	{
		const float * rhs = &other.m[j * 4]; // column j of 'other'

		for (int row = 0; row < 4; ++row)
		{
			// Dot product of row 'row' of this matrix with column j of 'other'.
			result.m[(j * 4) + row] = (m[row] * rhs[0]) + (m[row + 4] * rhs[1]) + (m[row + 8] * rhs[2]) + (m[row + 12] * rhs[3]);
		}
	}

#endif // ENABLE_SSE_INTRINSICS

	return (result);
}

// Returns a copy of this matrix with every element multiplied by 'scalar'.
// This matrix is not modified.
Matrix4x4 Matrix4x4::operator * (float scalar) const
{
	Matrix4x4 scaled;

	for (int i = 0; i < 16; ++i)
	{
		scaled.m[i] = m[i] * scalar;
	}

	return scaled;
}